deepresearch-flow 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- deepresearch_flow/paper/db.py +34 -0
- deepresearch_flow/paper/web/app.py +106 -1
- deepresearch_flow/paper/web/constants.py +5 -4
- deepresearch_flow/paper/web/handlers/__init__.py +2 -1
- deepresearch_flow/paper/web/handlers/api.py +55 -0
- deepresearch_flow/paper/web/handlers/pages.py +105 -25
- deepresearch_flow/paper/web/markdown.py +60 -0
- deepresearch_flow/paper/web/pdfjs/web/viewer.html +57 -5
- deepresearch_flow/paper/web/pdfjs/web/viewer.js +5 -1
- deepresearch_flow/paper/web/static/js/detail.js +494 -125
- deepresearch_flow/paper/web/static/js/outline.js +48 -34
- deepresearch_flow/paper/web/static_assets.py +289 -0
- deepresearch_flow/paper/web/templates/detail.html +46 -69
- deepresearch_flow/paper/web/templates/index.html +3 -3
- deepresearch_flow/paper/web/templates.py +7 -4
- deepresearch_flow/recognize/cli.py +805 -26
- deepresearch_flow/recognize/katex_check.js +29 -0
- deepresearch_flow/recognize/math.py +719 -0
- deepresearch_flow/recognize/mermaid.py +690 -0
- {deepresearch_flow-0.4.0.dist-info → deepresearch_flow-0.5.0.dist-info}/METADATA +117 -4
- {deepresearch_flow-0.4.0.dist-info → deepresearch_flow-0.5.0.dist-info}/RECORD +25 -21
- {deepresearch_flow-0.4.0.dist-info → deepresearch_flow-0.5.0.dist-info}/WHEEL +0 -0
- {deepresearch_flow-0.4.0.dist-info → deepresearch_flow-0.5.0.dist-info}/entry_points.txt +0 -0
- {deepresearch_flow-0.4.0.dist-info → deepresearch_flow-0.5.0.dist-info}/licenses/LICENSE +0 -0
- {deepresearch_flow-0.4.0.dist-info → deepresearch_flow-0.5.0.dist-info}/top_level.txt +0 -0
deepresearch_flow/paper/db.py
CHANGED
|
@@ -600,6 +600,32 @@ def register_db_commands(db_group: click.Group) -> None:
|
|
|
600
600
|
)
|
|
601
601
|
@click.option("--cache-dir", "cache_dir", default=None, help="Cache directory for merged inputs")
|
|
602
602
|
@click.option("--no-cache", "no_cache", is_flag=True, help="Disable cache for db serve")
|
|
603
|
+
@click.option(
|
|
604
|
+
"--static-base-url",
|
|
605
|
+
"static_base_url",
|
|
606
|
+
default=None,
|
|
607
|
+
help="Static asset base URL (e.g. https://static.example.com)",
|
|
608
|
+
)
|
|
609
|
+
@click.option(
|
|
610
|
+
"--static-mode",
|
|
611
|
+
"static_mode",
|
|
612
|
+
type=click.Choice(["auto", "dev", "prod"]),
|
|
613
|
+
default="auto",
|
|
614
|
+
show_default=True,
|
|
615
|
+
help="Static asset mode (dev uses local assets, prod uses static base URL)",
|
|
616
|
+
)
|
|
617
|
+
@click.option(
|
|
618
|
+
"--static-export-dir",
|
|
619
|
+
"static_export_dir",
|
|
620
|
+
default=None,
|
|
621
|
+
help="Optional export directory for hashed static assets",
|
|
622
|
+
)
|
|
623
|
+
@click.option(
|
|
624
|
+
"--pdfjs-cdn-base-url",
|
|
625
|
+
"pdfjs_cdn_base_url",
|
|
626
|
+
default=None,
|
|
627
|
+
help="PDF.js CDN base URL (defaults to jsDelivr)",
|
|
628
|
+
)
|
|
603
629
|
@click.option("--host", default="127.0.0.1", show_default=True, help="Bind host")
|
|
604
630
|
@click.option("--port", default=8000, type=int, show_default=True, help="Bind port")
|
|
605
631
|
@click.option(
|
|
@@ -617,6 +643,10 @@ def register_db_commands(db_group: click.Group) -> None:
|
|
|
617
643
|
pdf_roots: tuple[str, ...],
|
|
618
644
|
cache_dir: str | None,
|
|
619
645
|
no_cache: bool,
|
|
646
|
+
static_base_url: str | None,
|
|
647
|
+
static_mode: str,
|
|
648
|
+
static_export_dir: str | None,
|
|
649
|
+
pdfjs_cdn_base_url: str | None,
|
|
620
650
|
host: str,
|
|
621
651
|
port: int,
|
|
622
652
|
fallback_language: str,
|
|
@@ -635,6 +665,10 @@ def register_db_commands(db_group: click.Group) -> None:
|
|
|
635
665
|
pdf_roots=[Path(root) for root in pdf_roots],
|
|
636
666
|
cache_dir=Path(cache_dir) if cache_dir else None,
|
|
637
667
|
use_cache=not no_cache,
|
|
668
|
+
static_base_url=static_base_url,
|
|
669
|
+
static_mode=static_mode,
|
|
670
|
+
static_export_dir=Path(static_export_dir) if static_export_dir else None,
|
|
671
|
+
pdfjs_cdn_base_url=pdfjs_cdn_base_url,
|
|
638
672
|
)
|
|
639
673
|
except Exception as exc:
|
|
640
674
|
raise click.ClickException(str(exc)) from exc
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
3
|
import logging
|
|
4
|
+
import os
|
|
4
5
|
from pathlib import Path
|
|
5
6
|
|
|
6
7
|
from starlette.applications import Starlette
|
|
@@ -10,8 +11,9 @@ from starlette.routing import Mount, Route
|
|
|
10
11
|
from starlette.staticfiles import StaticFiles
|
|
11
12
|
|
|
12
13
|
from deepresearch_flow.paper.db_ops import build_index, load_and_merge_papers
|
|
13
|
-
from deepresearch_flow.paper.web.constants import PDFJS_STATIC_DIR, STATIC_DIR
|
|
14
|
+
from deepresearch_flow.paper.web.constants import DEFAULT_PDFJS_CDN_BASE_URL, PDFJS_STATIC_DIR, STATIC_DIR
|
|
14
15
|
from deepresearch_flow.paper.web.handlers import (
|
|
16
|
+
api_markdown,
|
|
15
17
|
api_papers,
|
|
16
18
|
api_pdf,
|
|
17
19
|
api_stats,
|
|
@@ -21,6 +23,7 @@ from deepresearch_flow.paper.web.handlers import (
|
|
|
21
23
|
stats_page,
|
|
22
24
|
)
|
|
23
25
|
from deepresearch_flow.paper.web.markdown import create_md_renderer
|
|
26
|
+
from deepresearch_flow.paper.web.static_assets import build_static_assets
|
|
24
27
|
|
|
25
28
|
logger = logging.getLogger(__name__)
|
|
26
29
|
|
|
@@ -32,6 +35,35 @@ class _NoIndexMiddleware(BaseHTTPMiddleware):
|
|
|
32
35
|
return response
|
|
33
36
|
|
|
34
37
|
|
|
38
|
+
class _StaticAssetFiles(StaticFiles):
    """Static file app that adds a default Cache-Control header to 200 responses."""

    def __init__(self, *args, cache_control: str | None = None, **kwargs) -> None:
        """Forward all arguments to ``StaticFiles`` and remember the header value.

        ``cache_control`` is the Cache-Control value to apply; a falsy value
        leaves responses untouched.
        """
        super().__init__(*args, **kwargs)
        self._cache_control = cache_control

    async def get_response(self, path: str, scope):  # type: ignore[override]
        """Return the parent's response, stamping Cache-Control on 200s only."""
        served = await super().get_response(path, scope)
        if served.status_code != 200 or not self._cache_control:
            return served
        # setdefault keeps any Cache-Control header the parent already set.
        served.headers.setdefault("Cache-Control", self._cache_control)
        return served
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def _normalize_static_mode(value: str | None) -> str:
    """Map a user-supplied static mode onto ``"dev"``, ``"prod"`` or ``"auto"``.

    Matching ignores surrounding whitespace and letter case. Missing, empty
    and unrecognised values all resolve to ``"auto"``.
    """
    aliases = {
        "dev": "dev",
        "development": "dev",
        "prod": "prod",
        "production": "prod",
    }
    key = value.strip().lower() if value else ""
    return aliases.get(key, "auto")
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _resolve_static_mode(value: str, static_base_url: str | None) -> str:
    """Turn ``"auto"`` into a concrete mode; pass any other value through.

    ``"auto"`` becomes ``"prod"`` when a static base URL is set and ``"dev"``
    otherwise.
    """
    if value != "auto":
        return value
    if static_base_url:
        return "prod"
    return "dev"
|
|
65
|
+
|
|
66
|
+
|
|
35
67
|
def create_app(
|
|
36
68
|
*,
|
|
37
69
|
db_paths: list[Path],
|
|
@@ -42,6 +74,10 @@ def create_app(
|
|
|
42
74
|
pdf_roots: list[Path] | None = None,
|
|
43
75
|
cache_dir: Path | None = None,
|
|
44
76
|
use_cache: bool = True,
|
|
77
|
+
static_base_url: str | None = None,
|
|
78
|
+
static_mode: str | None = None,
|
|
79
|
+
static_export_dir: Path | None = None,
|
|
80
|
+
pdfjs_cdn_base_url: str | None = None,
|
|
45
81
|
) -> Starlette:
|
|
46
82
|
papers = load_and_merge_papers(db_paths, bibtex_path, cache_dir, use_cache, pdf_roots=pdf_roots)
|
|
47
83
|
|
|
@@ -55,6 +91,44 @@ def create_app(
|
|
|
55
91
|
pdf_roots=pdf_roots,
|
|
56
92
|
)
|
|
57
93
|
md = create_md_renderer()
|
|
94
|
+
static_base_url = static_base_url or os.getenv("PAPER_DB_STATIC_BASE_URL")
|
|
95
|
+
static_mode = _normalize_static_mode(static_mode or os.getenv("PAPER_DB_STATIC_MODE"))
|
|
96
|
+
resolved_mode = _resolve_static_mode(static_mode, static_base_url)
|
|
97
|
+
export_dir_value = static_export_dir or os.getenv("PAPER_DB_STATIC_EXPORT_DIR")
|
|
98
|
+
export_dir = Path(export_dir_value) if export_dir_value else None
|
|
99
|
+
pdfjs_cdn_base_url = (
|
|
100
|
+
pdfjs_cdn_base_url
|
|
101
|
+
or os.getenv("PAPER_DB_PDFJS_CDN_BASE_URL")
|
|
102
|
+
or DEFAULT_PDFJS_CDN_BASE_URL
|
|
103
|
+
)
|
|
104
|
+
if pdfjs_cdn_base_url:
|
|
105
|
+
lowered = pdfjs_cdn_base_url.strip().lower()
|
|
106
|
+
if lowered in {"none", "off", "local"}:
|
|
107
|
+
pdfjs_cdn_base_url = None
|
|
108
|
+
else:
|
|
109
|
+
pdfjs_cdn_base_url = pdfjs_cdn_base_url.rstrip("/")
|
|
110
|
+
|
|
111
|
+
asset_config = None
|
|
112
|
+
if resolved_mode == "prod":
|
|
113
|
+
if not static_base_url:
|
|
114
|
+
logger.warning("Static mode set to prod without base URL; falling back to dev asset routes.")
|
|
115
|
+
resolved_mode = "dev"
|
|
116
|
+
else:
|
|
117
|
+
asset_config = build_static_assets(
|
|
118
|
+
index,
|
|
119
|
+
static_base_url=static_base_url,
|
|
120
|
+
static_export_dir=export_dir,
|
|
121
|
+
)
|
|
122
|
+
if resolved_mode == "dev" and export_dir:
|
|
123
|
+
asset_config = build_static_assets(
|
|
124
|
+
index,
|
|
125
|
+
static_base_url="",
|
|
126
|
+
static_export_dir=export_dir,
|
|
127
|
+
allow_empty_base=True,
|
|
128
|
+
)
|
|
129
|
+
if asset_config is None:
|
|
130
|
+
asset_config = build_static_assets(index, static_base_url=None)
|
|
131
|
+
|
|
58
132
|
routes = [
|
|
59
133
|
Route("/", index_page, methods=["GET"]),
|
|
60
134
|
Route("/robots.txt", robots_txt, methods=["GET"]),
|
|
@@ -63,6 +137,7 @@ def create_app(
|
|
|
63
137
|
Route("/api/papers", api_papers, methods=["GET"]),
|
|
64
138
|
Route("/api/stats", api_stats, methods=["GET"]),
|
|
65
139
|
Route("/api/pdf/{source_hash:str}", api_pdf, methods=["GET"]),
|
|
140
|
+
Route("/api/dev/markdown/{source_hash:str}", api_markdown, methods=["GET"]),
|
|
66
141
|
]
|
|
67
142
|
if PDFJS_STATIC_DIR.exists():
|
|
68
143
|
routes.append(
|
|
@@ -85,10 +160,40 @@ def create_app(
|
|
|
85
160
|
name="static",
|
|
86
161
|
)
|
|
87
162
|
)
|
|
163
|
+
if export_dir and export_dir.exists() and asset_config.enabled and not asset_config.base_url:
|
|
164
|
+
cache_header = "public, max-age=31536000, immutable"
|
|
165
|
+
routes.extend(
|
|
166
|
+
[
|
|
167
|
+
Mount(
|
|
168
|
+
"/pdf",
|
|
169
|
+
app=_StaticAssetFiles(directory=str(export_dir / "pdf"), cache_control=cache_header),
|
|
170
|
+
name="static_pdf",
|
|
171
|
+
),
|
|
172
|
+
Mount(
|
|
173
|
+
"/images",
|
|
174
|
+
app=_StaticAssetFiles(directory=str(export_dir / "images"), cache_control=cache_header),
|
|
175
|
+
name="static_images",
|
|
176
|
+
),
|
|
177
|
+
Mount(
|
|
178
|
+
"/md",
|
|
179
|
+
app=_StaticAssetFiles(directory=str(export_dir / "md"), cache_control=cache_header),
|
|
180
|
+
name="static_md",
|
|
181
|
+
),
|
|
182
|
+
Mount(
|
|
183
|
+
"/md_translate",
|
|
184
|
+
app=_StaticAssetFiles(directory=str(export_dir / "md_translate"), cache_control=cache_header),
|
|
185
|
+
name="static_md_translate",
|
|
186
|
+
),
|
|
187
|
+
]
|
|
188
|
+
)
|
|
88
189
|
app = Starlette(routes=routes)
|
|
89
190
|
app.add_middleware(_NoIndexMiddleware)
|
|
90
191
|
app.state.index = index
|
|
91
192
|
app.state.md = md
|
|
92
193
|
app.state.fallback_language = fallback_language
|
|
93
194
|
app.state.pdf_roots = pdf_roots
|
|
195
|
+
app.state.static_mode = resolved_mode
|
|
196
|
+
app.state.asset_config = asset_config
|
|
197
|
+
app.state.static_export_dir = export_dir
|
|
198
|
+
app.state.pdfjs_cdn_base_url = pdfjs_cdn_base_url
|
|
94
199
|
return app
|
|
@@ -4,14 +4,15 @@ from pathlib import Path
|
|
|
4
4
|
|
|
5
5
|
# CDN URLs for external libraries
|
|
6
6
|
CDN_ECHARTS = "https://cdn.jsdelivr.net/npm/echarts@5/dist/echarts.min.js"
|
|
7
|
-
CDN_MERMAID = "https://cdn.jsdelivr.net/npm/mermaid@
|
|
8
|
-
CDN_KATEX = "https://cdn.jsdelivr.net/npm/katex@0.16.
|
|
9
|
-
CDN_KATEX_JS = "https://cdn.jsdelivr.net/npm/katex@0.16.
|
|
10
|
-
CDN_KATEX_AUTO = "https://cdn.jsdelivr.net/npm/katex@0.16.
|
|
7
|
+
CDN_MERMAID = "https://cdn.jsdelivr.net/npm/mermaid@11/dist/mermaid.min.js"
|
|
8
|
+
CDN_KATEX = "https://cdn.jsdelivr.net/npm/katex@0.16.27/dist/katex.min.css"
|
|
9
|
+
CDN_KATEX_JS = "https://cdn.jsdelivr.net/npm/katex@0.16.27/dist/katex.min.js"
|
|
10
|
+
CDN_KATEX_AUTO = "https://cdn.jsdelivr.net/npm/katex@0.16.27/dist/contrib/auto-render.min.js"
|
|
11
11
|
|
|
12
12
|
# Use legacy builds to ensure `pdfjsLib` is available as a global.
|
|
13
13
|
CDN_PDFJS = "https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/legacy/build/pdf.min.js"
|
|
14
14
|
CDN_PDFJS_WORKER = "https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174/legacy/build/pdf.worker.min.js"
|
|
15
|
+
DEFAULT_PDFJS_CDN_BASE_URL = "https://cdn.jsdelivr.net/npm/pdfjs-dist@3.11.174"
|
|
15
16
|
|
|
16
17
|
# PDF.js viewer configuration
|
|
17
18
|
PDFJS_VIEWER_PATH = "/pdfjs/web/viewer.html"
|
|
@@ -1,12 +1,13 @@
|
|
|
1
1
|
"""Route handlers for paper web UI."""
|
|
2
2
|
|
|
3
|
-
from .api import api_papers, api_pdf, api_stats
|
|
3
|
+
from .api import api_markdown, api_papers, api_pdf, api_stats
|
|
4
4
|
from .pages import index_page, paper_detail, robots_txt, stats_page
|
|
5
5
|
|
|
6
6
|
__all__ = [
|
|
7
7
|
"api_papers",
|
|
8
8
|
"api_pdf",
|
|
9
9
|
"api_stats",
|
|
10
|
+
"api_markdown",
|
|
10
11
|
"index_page",
|
|
11
12
|
"paper_detail",
|
|
12
13
|
"robots_txt",
|
|
@@ -19,6 +19,8 @@ from deepresearch_flow.paper.web.filters import (
|
|
|
19
19
|
presence_filter,
|
|
20
20
|
sorted_ids,
|
|
21
21
|
)
|
|
22
|
+
from deepresearch_flow.paper.web.markdown import normalize_markdown_images
|
|
23
|
+
from deepresearch_flow.paper.web.static_assets import resolve_asset_urls
|
|
22
24
|
from deepresearch_flow.paper.web.text import extract_summary_snippet, normalize_title, normalize_venue
|
|
23
25
|
from deepresearch_flow.paper.web.query import Query, QueryTerm, parse_query
|
|
24
26
|
|
|
@@ -92,9 +94,18 @@ def _apply_query(index: PaperIndex, query: Query) -> set[int]:
|
|
|
92
94
|
return result
|
|
93
95
|
|
|
94
96
|
|
|
97
|
+
def _safe_read_text(path: Path) -> str:
    """Read ``path`` as text, trying UTF-8 first and Latin-1 on decode failure."""
    try:
        with path.open("r", encoding="utf-8") as handle:
            return handle.read()
    except UnicodeDecodeError:
        # Latin-1 maps every byte to a character, so this retry cannot fail to decode.
        with path.open("r", encoding="latin-1") as handle:
            return handle.read()
|
|
102
|
+
|
|
103
|
+
|
|
95
104
|
async def api_papers(request: Request) -> JSONResponse:
|
|
96
105
|
"""API endpoint for paper list with filtering, sorting, and pagination."""
|
|
97
106
|
index: PaperIndex = request.app.state.index
|
|
107
|
+
asset_config = request.app.state.asset_config
|
|
108
|
+
prefer_local = request.app.state.static_mode == "dev"
|
|
98
109
|
filters = parse_filters(request)
|
|
99
110
|
page = int(filters["page"])
|
|
100
111
|
page_size = int(filters["page_size"])
|
|
@@ -165,6 +176,7 @@ async def api_papers(request: Request) -> JSONResponse:
|
|
|
165
176
|
source_hash = str(paper.get("source_hash") or stable_hash(str(paper.get("source_path") or idx)))
|
|
166
177
|
translations = index.translated_md_by_hash.get(source_hash, {})
|
|
167
178
|
translation_languages = sorted(translations.keys(), key=str.lower)
|
|
179
|
+
asset_urls = resolve_asset_urls(index, source_hash, asset_config, prefer_local=prefer_local)
|
|
168
180
|
items.append(
|
|
169
181
|
{
|
|
170
182
|
"source_hash": source_hash,
|
|
@@ -183,6 +195,10 @@ async def api_papers(request: Request) -> JSONResponse:
|
|
|
183
195
|
"has_summary": bool(paper.get("_has_summary")),
|
|
184
196
|
"is_pdf_only": bool(paper.get("_is_pdf_only")),
|
|
185
197
|
"translation_languages": translation_languages,
|
|
198
|
+
"pdf_url": asset_urls["pdf_url"],
|
|
199
|
+
"md_url": asset_urls["md_url"],
|
|
200
|
+
"md_translated_url": asset_urls["md_translated_url"],
|
|
201
|
+
"images_base_url": asset_urls["images_base_url"],
|
|
186
202
|
}
|
|
187
203
|
)
|
|
188
204
|
|
|
@@ -215,3 +231,42 @@ async def api_pdf(request: Request) -> Response:
|
|
|
215
231
|
if allowed_roots and not _ensure_under_roots(pdf_path, allowed_roots):
|
|
216
232
|
return Response("Forbidden", status_code=403)
|
|
217
233
|
return FileResponse(pdf_path)
|
|
234
|
+
|
|
235
|
+
|
|
236
|
+
async def api_markdown(request: Request) -> Response:
    """Dev-only API endpoint to serve raw markdown content.

    Responds with 404 unless the app's static mode is ``"dev"``. The optional
    ``lang`` query parameter selects a translated document instead of the
    source markdown. An exported copy under the static export directory is
    preferred when the asset config is enabled with an empty base URL;
    otherwise the path recorded in the paper index is read. Translated
    markdown read from the index is passed through
    ``normalize_markdown_images``.
    """
    state = request.app.state
    if state.static_mode != "dev":
        return Response("Not Found", status_code=404)

    index: PaperIndex = state.index
    asset_config = state.asset_config
    export_dir = state.static_export_dir
    source_hash = request.path_params["source_hash"]
    lang = request.query_params.get("lang")

    # Exported assets are only served locally when no external base URL is set.
    if export_dir and asset_config and asset_config.enabled and not asset_config.base_url:
        if lang:
            exported_url = asset_config.translated_md_urls.get(source_hash, {}).get(lang.lower())
        else:
            exported_url = asset_config.md_urls.get(source_hash)
        if exported_url:
            exported_file = export_dir / exported_url.lstrip("/")
            if exported_file.exists():
                return Response(_safe_read_text(exported_file), media_type="text/markdown")

    # Fall back to the original file location recorded in the index.
    if lang:
        indexed_path = index.translated_md_by_hash.get(source_hash, {}).get(lang.lower())
    else:
        indexed_path = index.md_path_by_hash.get(source_hash)
    if not indexed_path:
        return Response("Markdown not found", status_code=404)

    content = _safe_read_text(indexed_path)
    if lang:
        content = normalize_markdown_images(content)
    return Response(content, media_type="text/markdown")
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
5
|
import html
|
|
6
|
+
from pathlib import Path
|
|
6
7
|
from urllib.parse import urlencode
|
|
7
8
|
|
|
8
9
|
from starlette.requests import Request
|
|
@@ -16,6 +17,7 @@ from deepresearch_flow.paper.web.markdown import (
|
|
|
16
17
|
render_paper_markdown,
|
|
17
18
|
select_template_tag,
|
|
18
19
|
)
|
|
20
|
+
from deepresearch_flow.paper.web.static_assets import resolve_asset_urls
|
|
19
21
|
from deepresearch_flow.paper.web.text import normalize_title
|
|
20
22
|
from deepresearch_flow.paper.web.templates import (
|
|
21
23
|
build_pdfjs_viewer_url,
|
|
@@ -23,6 +25,47 @@ from deepresearch_flow.paper.web.templates import (
|
|
|
23
25
|
)
|
|
24
26
|
|
|
25
27
|
|
|
28
|
+
def _safe_read_text(path: Path) -> str:
    """Return the text of ``path``, decoding as UTF-8 or, failing that, Latin-1."""
    try:
        with path.open("r", encoding="utf-8") as stream:
            return stream.read()
    except UnicodeDecodeError:
        # Every byte sequence is valid Latin-1, so the second attempt always decodes.
        with path.open("r", encoding="latin-1") as stream:
            return stream.read()
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def _load_markdown_for_view(
    index: PaperIndex,
    asset_config,
    export_dir: Path | None,
    source_hash: str,
    *,
    lang: str | None = None,
) -> str | None:
    """Load the markdown text to render for a paper, or ``None`` if unavailable.

    With ``lang`` set, the translated document for that language is looked up;
    otherwise the source markdown is used. An exported copy under
    ``export_dir`` is preferred when the asset config is enabled with an empty
    base URL and the exported file exists. Otherwise the path stored in the
    index is read, and translated text is passed through
    ``normalize_markdown_images``.
    """
    if export_dir and asset_config and asset_config.enabled and not asset_config.base_url:
        if lang:
            asset_url = asset_config.translated_md_urls.get(source_hash, {}).get(lang.lower())
        else:
            asset_url = asset_config.md_urls.get(source_hash)
        exported = export_dir / asset_url.lstrip("/") if asset_url else None
        if exported is not None and exported.exists():
            return _safe_read_text(exported)

    # No usable export: read from the location recorded in the index.
    if lang:
        local_path = index.translated_md_by_hash.get(source_hash, {}).get(lang.lower())
    else:
        local_path = index.md_path_by_hash.get(source_hash)
    if not local_path:
        return None

    content = _safe_read_text(local_path)
    return normalize_markdown_images(content) if lang else content
|
|
67
|
+
|
|
68
|
+
|
|
26
69
|
async def robots_txt(_: Request) -> Response:
|
|
27
70
|
"""Serve robots.txt to disallow all crawlers."""
|
|
28
71
|
return Response("User-agent: *\nDisallow: /\n", media_type="text/plain")
|
|
@@ -78,7 +121,13 @@ async def paper_detail(request: Request) -> HTMLResponse:
|
|
|
78
121
|
embed = request.query_params.get("embed") == "1"
|
|
79
122
|
|
|
80
123
|
pdf_path = index.pdf_path_by_hash.get(source_hash)
|
|
81
|
-
|
|
124
|
+
asset_urls = resolve_asset_urls(
|
|
125
|
+
index,
|
|
126
|
+
source_hash,
|
|
127
|
+
request.app.state.asset_config,
|
|
128
|
+
prefer_local=request.app.state.static_mode == "dev",
|
|
129
|
+
)
|
|
130
|
+
pdf_url = asset_urls["pdf_url"] or ""
|
|
82
131
|
source_available = source_hash in index.md_path_by_hash
|
|
83
132
|
translations = index.translated_md_by_hash.get(source_hash, {})
|
|
84
133
|
translation_langs = sorted(translations.keys(), key=str.lower)
|
|
@@ -156,14 +205,18 @@ async def paper_detail(request: Request) -> HTMLResponse:
|
|
|
156
205
|
|
|
157
206
|
# Initialize template variables
|
|
158
207
|
body_html = ""
|
|
159
|
-
raw_content = ""
|
|
160
208
|
summary_template_name = ""
|
|
161
209
|
template_warning = ""
|
|
162
210
|
template_controls = ""
|
|
163
211
|
source_path_str = ""
|
|
164
212
|
translated_path_str = ""
|
|
213
|
+
source_markdown_url = ""
|
|
214
|
+
translated_markdown_url = ""
|
|
215
|
+
images_base_url = asset_urls["images_base_url"] or ""
|
|
165
216
|
pdf_filename = ""
|
|
166
217
|
pdfjs_url = ""
|
|
218
|
+
pdfjs_script_url = ""
|
|
219
|
+
pdfjs_worker_url = ""
|
|
167
220
|
left_src = ""
|
|
168
221
|
right_src = ""
|
|
169
222
|
split_options: list[tuple[str, str]] = []
|
|
@@ -208,21 +261,27 @@ if (templateSelect) {{
|
|
|
208
261
|
</script>
|
|
209
262
|
"""
|
|
210
263
|
|
|
264
|
+
prefer_local = request.app.state.static_mode == "dev"
|
|
265
|
+
|
|
211
266
|
# Source view
|
|
212
267
|
if view == "source":
|
|
213
268
|
source_path = index.md_path_by_hash.get(source_hash)
|
|
214
|
-
if not source_path:
|
|
269
|
+
if not source_path or not asset_urls["md_url"]:
|
|
215
270
|
body_html = '<div class="warning">Source markdown not found. Provide --md-root to enable source viewing.</div>'
|
|
216
271
|
else:
|
|
217
|
-
|
|
218
|
-
raw = source_path.read_text(encoding="utf-8")
|
|
219
|
-
except UnicodeDecodeError:
|
|
220
|
-
raw = source_path.read_text(encoding="latin-1")
|
|
221
|
-
md_renderer = create_md_renderer()
|
|
222
|
-
body_html = render_markdown_with_math_placeholders(md_renderer, raw)
|
|
223
|
-
raw_content = raw
|
|
272
|
+
source_markdown_url = asset_urls["md_url"] or ""
|
|
224
273
|
source_path_str = str(source_path)
|
|
225
274
|
show_outline = True
|
|
275
|
+
if prefer_local:
|
|
276
|
+
raw = _load_markdown_for_view(
|
|
277
|
+
index,
|
|
278
|
+
request.app.state.asset_config,
|
|
279
|
+
request.app.state.static_export_dir,
|
|
280
|
+
source_hash,
|
|
281
|
+
)
|
|
282
|
+
if raw is not None:
|
|
283
|
+
md_renderer = create_md_renderer()
|
|
284
|
+
body_html = render_markdown_with_math_placeholders(md_renderer, raw)
|
|
226
285
|
|
|
227
286
|
# Translated view
|
|
228
287
|
if view == "translated":
|
|
@@ -230,38 +289,55 @@ if (templateSelect) {{
|
|
|
230
289
|
body_html = '<div class="warning">No translated markdown found. Provide <code>--md-translated-root</code> and place <code><base>.<lang>.md</code> under that root.</div>'
|
|
231
290
|
else:
|
|
232
291
|
translated_path = translations.get(selected_lang)
|
|
233
|
-
|
|
292
|
+
translated_markdown_url = asset_urls["md_translated_url"].get(selected_lang, "")
|
|
293
|
+
if not translated_path or not translated_markdown_url:
|
|
234
294
|
body_html = '<div class="warning">Translated markdown not found for the selected language.</div>'
|
|
235
295
|
else:
|
|
236
|
-
try:
|
|
237
|
-
raw = translated_path.read_text(encoding="utf-8")
|
|
238
|
-
except UnicodeDecodeError:
|
|
239
|
-
raw = translated_path.read_text(encoding="latin-1")
|
|
240
|
-
raw = normalize_markdown_images(raw)
|
|
241
|
-
md_renderer = create_md_renderer()
|
|
242
|
-
body_html = render_markdown_with_math_placeholders(md_renderer, raw)
|
|
243
|
-
raw_content = raw
|
|
244
296
|
translated_path_str = str(translated_path)
|
|
245
297
|
show_outline = True
|
|
298
|
+
if prefer_local:
|
|
299
|
+
raw = _load_markdown_for_view(
|
|
300
|
+
index,
|
|
301
|
+
request.app.state.asset_config,
|
|
302
|
+
request.app.state.static_export_dir,
|
|
303
|
+
source_hash,
|
|
304
|
+
lang=selected_lang,
|
|
305
|
+
)
|
|
306
|
+
if raw is not None:
|
|
307
|
+
md_renderer = create_md_renderer()
|
|
308
|
+
body_html = render_markdown_with_math_placeholders(md_renderer, raw)
|
|
246
309
|
|
|
247
310
|
# PDF view
|
|
248
311
|
if view == "pdf":
|
|
249
|
-
if not pdf_path:
|
|
312
|
+
if not pdf_path or not pdf_url:
|
|
250
313
|
body_html = '<div class="warning">PDF not found. Provide --pdf-root to enable PDF viewing.</div>'
|
|
251
314
|
pdf_filename = str(pdf_path.name) if pdf_path else ""
|
|
315
|
+
pdfjs_cdn_base_url = request.app.state.pdfjs_cdn_base_url
|
|
316
|
+
if pdfjs_cdn_base_url:
|
|
317
|
+
pdfjs_script_url = f"{pdfjs_cdn_base_url}/legacy/build/pdf.min.js"
|
|
318
|
+
pdfjs_worker_url = f"{pdfjs_cdn_base_url}/legacy/build/pdf.worker.min.js"
|
|
319
|
+
else:
|
|
320
|
+
pdfjs_script_url = "/pdfjs/build/pdf.js"
|
|
321
|
+
pdfjs_worker_url = "/pdfjs/build/pdf.worker.js"
|
|
252
322
|
|
|
253
323
|
# PDF.js view
|
|
254
324
|
if view == "pdfjs":
|
|
255
|
-
if not pdf_path:
|
|
325
|
+
if not pdf_path or not pdf_url:
|
|
256
326
|
body_html = '<div class="warning">PDF not found. Provide --pdf-root to enable PDF viewing.</div>'
|
|
257
|
-
pdfjs_url = build_pdfjs_viewer_url(
|
|
327
|
+
pdfjs_url = build_pdfjs_viewer_url(
|
|
328
|
+
pdf_url,
|
|
329
|
+
cdn_base_url=request.app.state.pdfjs_cdn_base_url,
|
|
330
|
+
)
|
|
258
331
|
pdf_filename = str(pdf_path.name) if pdf_path else ""
|
|
259
332
|
|
|
260
333
|
# Split view
|
|
261
334
|
if view == "split":
|
|
262
335
|
def pane_src(pane_view: str) -> str:
|
|
263
|
-
if pane_view == "pdfjs" and pdf_path:
|
|
264
|
-
return build_pdfjs_viewer_url(
|
|
336
|
+
if pane_view == "pdfjs" and pdf_path and pdf_url:
|
|
337
|
+
return build_pdfjs_viewer_url(
|
|
338
|
+
pdf_url,
|
|
339
|
+
cdn_base_url=request.app.state.pdfjs_cdn_base_url,
|
|
340
|
+
)
|
|
265
341
|
params: dict[str, str] = {"view": pane_view, "embed": "1"}
|
|
266
342
|
if pane_view == "summary" and template_param:
|
|
267
343
|
params["template"] = str(template_param)
|
|
@@ -307,12 +383,14 @@ if (templateSelect) {{
|
|
|
307
383
|
show_outline=show_outline,
|
|
308
384
|
# Content variables
|
|
309
385
|
body_html=body_html,
|
|
310
|
-
raw_content=raw_content,
|
|
311
386
|
summary_template_name=summary_template_name,
|
|
312
387
|
template_warning=template_warning,
|
|
313
388
|
template_controls=template_controls,
|
|
314
389
|
available_templates=available_templates,
|
|
315
390
|
selected_template_tag=selected_tag,
|
|
391
|
+
images_base_url=images_base_url,
|
|
392
|
+
source_markdown_url=source_markdown_url,
|
|
393
|
+
translated_markdown_url=translated_markdown_url,
|
|
316
394
|
# Source view
|
|
317
395
|
source_path=source_path_str,
|
|
318
396
|
# Translated view
|
|
@@ -322,6 +400,8 @@ if (templateSelect) {{
|
|
|
322
400
|
# PDF view
|
|
323
401
|
pdf_filename=pdf_filename,
|
|
324
402
|
pdf_url=pdf_url,
|
|
403
|
+
pdfjs_script_url=pdfjs_script_url,
|
|
404
|
+
pdfjs_worker_url=pdfjs_worker_url,
|
|
325
405
|
# PDF.js view
|
|
326
406
|
pdfjs_url=pdfjs_url,
|
|
327
407
|
# Split view
|
|
@@ -96,6 +96,64 @@ def normalize_markdown_images(text: str) -> str:
|
|
|
96
96
|
return "\n".join(out)
|
|
97
97
|
|
|
98
98
|
|
|
99
|
+
def normalize_fenced_code_blocks(text: str) -> str:
    """Move fence markers that follow other text onto a line of their own.

    For each line, the first run of three or more backticks or tildes is
    located. If non-whitespace text precedes it, the line is split in two:
    the text before the run (right-stripped) and the remainder starting at
    the run. All other lines are kept unchanged. Lines are re-joined with
    ``"\\n"``, so a trailing newline in ``text`` is not preserved.
    """
    fence_pattern = re.compile(r"(`{3,}|~{3,})")
    normalized: list[str] = []
    for raw_line in text.splitlines():
        found = fence_pattern.search(raw_line)
        leading = raw_line[: found.start()] if found else ""
        if leading.strip():
            normalized.extend((leading.rstrip(), raw_line[found.start() :].lstrip()))
        else:
            normalized.append(raw_line)
    return "\n".join(normalized)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def normalize_unbalanced_fences(text: str) -> str:
    """Drop an unmatched opening fence so later content still renders.

    Lines are scanned with CommonMark's fenced-code rules so the open/closed
    state matches what the renderer will see:

    * a fence is a run of three or more backticks *or* tildes (not a mix),
      indented by at most three spaces;
    * a backtick run followed by text containing another backtick is an
      inline code span, not an opening fence;
    * a closing fence uses the same character as the opener, is at least as
      long, and is followed only by whitespace.

    If the document ends while a fence is still open, that opening line is
    removed. Lines are re-joined with ``"\\n"``, so a trailing newline in
    ``text`` is not preserved.
    """
    fence_re = re.compile(r"(`{3,}|~{3,})(.*)$")
    lines = text.splitlines()
    open_index: int | None = None
    fence_char = ""
    fence_len = 0

    for position, line in enumerate(lines):
        stripped = line.lstrip(" ")
        if len(line) - len(stripped) > 3:
            # Four or more leading spaces can never start or end a fence.
            continue
        match = fence_re.match(stripped)
        if not match:
            continue
        run, rest = match.groups()
        if open_index is None:
            if run[0] == "`" and "`" in rest:
                # e.g. "```x``` here": an inline code span, not a fence opener.
                continue
            open_index = position
            fence_char = run[0]
            fence_len = len(run)
        elif run[0] == fence_char and len(run) >= fence_len and not rest.strip():
            # Only a bare fence closes; one with an info string is block content.
            open_index = None

    if open_index is not None:
        del lines[open_index]
    return "\n".join(lines)
|
|
155
|
+
|
|
156
|
+
|
|
99
157
|
def extract_math_placeholders(text: str) -> tuple[str, dict[str, str]]:
|
|
100
158
|
"""Extract math expressions and replace with placeholders."""
|
|
101
159
|
placeholders: dict[str, str] = {}
|
|
@@ -476,6 +534,8 @@ def extract_html_table_placeholders(text: str) -> tuple[str, dict[str, str]]:
|
|
|
476
534
|
|
|
477
535
|
def render_markdown_with_math_placeholders(md: MarkdownIt, text: str) -> str:
|
|
478
536
|
"""Render markdown with math, images, and tables properly escaped."""
|
|
537
|
+
text = normalize_fenced_code_blocks(text)
|
|
538
|
+
text = normalize_unbalanced_fences(text)
|
|
479
539
|
text = strip_paragraph_wrapped_tables(text)
|
|
480
540
|
text = normalize_footnote_definitions(text)
|
|
481
541
|
rendered, table_placeholders = extract_html_table_placeholders(text)
|